{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "A random forest classifier aimed at determining whether a stock will be higher or lower after some given amount of days.\n",
    "Replication of Khaidem, Saha, & Roy Dey (2016)\n",
    "\n",
    "Documentation on function:\n",
    "http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n",
    "\"\"\"\n",
    "\n",
    "import pandas as pd\n",
    "from sklearn.ensemble import RandomForestClassifier as make_forest\n",
    "from sklearn.metrics import mean_squared_error as mse\n",
    "from sklearn.metrics import accuracy_score as acc\n",
    "import numpy as np\n",
    "import tqdm\n",
    "\n",
    "'''\n",
    "### Outline ###\n",
    "We have a bunch of columns of different length target values\n",
    "We drop all target values except the ones we want to analyze (or else when we remove NA we will remove too much data)\n",
    "We then input the data and features in to the first .fit parameter, and the labels in the second\n",
    "'''\n",
    "criterion=\"gini\"\n",
    "num_features = 6\n",
    "n_estimators = 65\n",
    "prediction_window = 1\n",
    "oob_score = True\n",
    "full_data = pd.read_csv('data_preprocessed.csv')\n",
    "train_labels = [\"Close_detrend\",\"Volume\",\"EWMA\", \"SO\",\"WR\",\"RSI\"]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# drop all target columns not to be analyzed\n",
    "#headers = full_data.columns.values\n",
    "#headers = headers[13:] # should return just the headers of the target values\n",
    "#headers = headers[headers!='Target({})'.format(prediction_window)]\n",
    "#selected_data = full_data.drop(headers, axis=1)\n",
    "selected_data = full_data.dropna(axis=0, how='any') # using the subset parameter might allow us to skip dropping other targets?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "### Drop useless labels ###\n",
    "selected_data.drop([\"Unnamed: 0\"], axis = 1, inplace = True)\n",
    "selected_data.drop([\"Date\"], axis = 1, inplace = True)\n",
    "selected_data.drop([\"Open\",\"High\",\"Low\"], axis = 1, inplace = True)\n",
    "#selected_data.drop([\"Symbol\",\"Open\",\"High\",\"Low\"], axis = 1, inplace = True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def split_x_y(df,train_labels,prediction_window):\n",
    "    x = df[train_labels].as_matrix()\n",
    "    y = df['Target({})'.format(prediction_window)].as_matrix()\n",
    "    \n",
    "    return x,y\n",
    "    \n",
    "def train_on_df(x,y,train_frac):\n",
    "    msk = np.random.rand(len(x)) < train_frac\n",
    "    \n",
    "    train_x = x[msk]\n",
    "    train_y = y[msk]\n",
    "    \n",
    "    test_x = x[~msk]\n",
    "    test_y = y[~msk]\n",
    "    \n",
    "    Random_Forest = make_forest(n_estimators=n_estimators, max_features=num_features, bootstrap=True, oob_score=oob_score, verbose=0,criterion=criterion,n_jobs=-1)\n",
    "    Random_Forest.fit(train_x, train_y)\n",
    "        \n",
    "    \n",
    "    test_accurency = Random_Forest.score(test_x, test_y)\n",
    "    return Random_Forest, test_accurency\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train Modell on each stock and make predictions for 1 and 30 Day\n",
    "## Save them for each Stock into a csv file "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "stock_forests = {}\n",
    "import tqdm\n",
    "num_symboles = len(selected_data['Symbol'].unique())-1\n",
    "for idx,stock in tqdm.tqmd(enumerate(selected_data['Symbol'].unique())):\n",
    "    symbole_df = selected_data[selected_data[\"Symbol\"]==stock]\n",
    "\n",
    "    x1,y1 = split_x_y(symbole_df, train_labels,1)\n",
    "    x30,y30 = split_x_y(symbole_df, train_labels,30)\n",
    "\n",
    "\n",
    "    forest1, accurency1 = train_on_df(x1,y1,0.8)\n",
    "    forest30, accurency30 = train_on_df(x30,y30,0.8)\n",
    "\n",
    "    stock_forests[stock] = {1:{\"acc\":accurency1,\n",
    "                                \"forest\":forest1},\n",
    "                            30:{\"acc\":accurency30,\n",
    "                                \"forest\":forest30}\n",
    "                            }\n",
    "\n",
    "    df_stock = pd.DataFrame()\n",
    "    df_stock[\"Close\"] = symbole_df[\"Close\"]\n",
    "    df_stock[\"Close_detrend\"] = symbole_df[\"Close_detrend\"]\n",
    "    df_stock[\"Target(1)\"] = symbole_df[\"Target(1)\"]\n",
    "    df_stock[\"Target(30)\"] = symbole_df[\"Target(30)\"]\n",
    "    df_stock[\"Prediction(1)\"] = forest1.predict(symbole_df[train_labels].as_matrix())\n",
    "    df_stock[\"Prediction(30)\"] = forest30.predict(symbole_df[train_labels].as_matrix())\n",
    "    df_stock.to_csv(\"results/result_{}.csv\".format(stock))\n",
    "    print(\"Done {}/{}\".format(idx,num_symboles))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create File with acc results for all stocks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "f_all = open(\"results/_ALL.csv\",\"w\")\n",
    "f_all.write(\"Symbole,accPrediction(1),accPrediction(30)\\n\")\n",
    "for symbole, vals in stock_forests.items():\n",
    "    f_all.write(\"{},{},{}\\n\".format(symbole,vals[1][\"acc\"],vals[30][\"acc\"]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train forest over the market"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "x1,y1 = split_x_y(selected_data, train_labels,1)\n",
    "x30,y30 = split_x_y(selected_data, train_labels,30)\n",
    "\n",
    "complete_forest1, complete_acc1 = train_on_df(x1,y1,0.8)\n",
    "complete_forest30, complete_acc30 = train_on_df(x30,y30,0.8)\n",
    "\n",
    "print(\"\\tacc1: {}%\".format(str(round(complete_acc1*100,2))))\n",
    "print(\"\\tacc30: {}%\".format(str(round(complete_acc30*100,2))))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compare complet market against AAPL CAT BA and SBUX"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "to_compare = [\"AAPL\", \"CAT\", \"BA\", \"SBUX\"]\n",
    "complete_against ={}\n",
    "for stock in to_compare:\n",
    "    symbole_df = selected_data[selected_data[\"Symbol\"]==stock]\n",
    "\n",
    "    x1,y1 = split_x_y(symbole_df, train_labels,1)\n",
    "    x30,y30 = split_x_y(symbole_df, train_labels,30)\n",
    "\n",
    "    acc1 = complete_forest1.score(x1,y1)\n",
    "    acc30 = complete_forest30.score(x30,y30)\n",
    "    print(\"For Stock {} against complete:\".format(stock))\n",
    "    print(\"\\tacc1: {}%\".format(str(round(acc1*100,2))))\n",
    "    print(\"\\tacc30: {}%\".format(str(round(acc30*100,2))))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Insert one stock in other stock modell"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "to_compare = [\"AAPL\", \"CAT\", \"BA\", \"SBUX\"]\n",
    "for model in to_compare:\n",
    "    for test_stock in to_compare:\n",
    "        if test_stock == model:\n",
    "            continue\n",
    "        symbole_df = selected_data[selected_data[\"Symbol\"]==test_stock]\n",
    "\n",
    "        x1,y1 = split_x_y(symbole_df, train_labels,1)\n",
    "        x30,y30 = split_x_y(symbole_df, train_labels,30)\n",
    "        \n",
    "        acc1 = stock_forests[model][1][\"forest\"].score(x1,y1)\n",
    "        acc30 = stock_forests[model][30][\"forest\"].score(x30,y30)\n",
    "        print(\"For {} in {}-Model\".format(test_stock,model))\n",
    "        print(\"\\tacc1: {}%\".format(str(round(acc1*100,2))))\n",
    "        print(\"\\tacc30: {}%\".format(str(round(acc30*100,2))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#import pickle\n",
    "#f = open(\"complete_forest_v1.pick\",\"wb\")\n",
    "#s = pickle.dump(complete_forest,f)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
